Setup

knitr::opts_chunk$set(echo = TRUE)

Packages

pkgs <- c("tidyverse", "kableExtra", "plotly")
invisible(lapply(pkgs, library, character.only = T))
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Warning: package 'kableExtra' was built under R version 4.2.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Paths

root <- "D:/Documents/DegreesNYC/Degrees_data/Project Examples"
indir <- file.path(root, "Data")
intdir <- file.path(root, "Intermediate")
outdir <- file.path(root, "Output")

Project

Load Data

# data <- read.csv(file.path(indir, "mock_data.csv"))
data <- readxl::read_xlsx(file.path(intdir, "nyc_student_survey_2022.xlsx"), sheet = "Total")
data
## # A tibble: 1,127 × 11
##    DBN   Schoo…¹ Total…² Total…³ Total…⁴ Colla…⁵ Effec…⁶ Rigor…⁷ Suppo…⁸ Stron…⁹
##    <chr> <chr>     <dbl>   <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>   <chr>  
##  1 01M0… P.S. 0…    0.15  0.694     0.71 N/A     N/A     N/A     N/A     N/A    
##  2 01M1… P.S. 1…    0.3   0.0552    0.76 N/A     N/A     N/A     N/A     N/A    
##  3 01M1… P.S. 1…    0.69  0.982     0.93 N/A     N/A     N/A     N/A     N/A    
##  4 01M1… P.S. 1…    0.65  0.689     0.8  N/A     N/A     N/A     N/A     N/A    
##  5 01M2… Orchar…    0.34  0.514     0.92 N/A     N/A     N/A     N/A     N/A    
##  6 01M3… Univer…    0.64  0.840     0.93 N/A     N/A     N/A     N/A     N/A    
##  7 01M3… School…    0.05  0.383     0.45 N/A     N/A     N/A     N/A     N/A    
##  8 01M4… Univer…    0.38  0.794     1    N/A     N/A     N/A     N/A     N/A    
##  9 01M4… East S…    0.68  0.782     0.68 N/A     N/A     N/A     N/A     N/A    
## 10 01M4… Forsyt…    0.16  0.330     0.73 N/A     N/A     N/A     N/A     N/A    
## # … with 1,117 more rows, 1 more variable: `Trust Score` <chr>, and abbreviated
## #   variable names ¹​`School Name`, ²​`Total Parent Response Rate`,
## #   ³​`Total Student Response Rate`, ⁴​`Total Teacher Response Rate`,
## #   ⁵​`Collaborative Teachers Score`, ⁶​`Effective School Leadership Score`,
## #   ⁷​`Rigorous Instruction Score`, ⁸​`Supportive Environment Score`,
## #   ⁹​`Strong Family-Community Ties Score`

clean Data

data %>%
  # Keep only Manhattan schools
  filter(grepl("\\d{2}M\\d{3}", DBN)) %>%
  select(-c(ends_with("Score"))) %>%
  # Extract district
  mutate(district = substr(DBN, 1, 2)) %>%
  {data <<- .}

Descriptive Statistics

dstats <- function(x, y) {
  name <- str_remove_all(y, "\\s")
  mean_name <- paste0(name, "_mean")
  min_name <- paste0(name, "_min")
  max_name <- paste0(name, "_max")
  x %>%
    group_by(district) %>%
    summarize(mean_name = mean(!!sym(y)),
              min_name = min(!!sym(y)),
              max_name = max(!!sym(y))) %>%
    setNames(c("District", mean_name, min_name, max_name))
}

map_dfc(c("Total Parent Response Rate", "Total Student Response Rate", "Total Teacher Response Rate"), function(x) dstats(data, x)) %>%
  select(-matches("District...[^1]")) %>%
  rename(District = `District...1`) %>%
  kable() %>%
  kable_paper()
## New names:
## • `District` -> `District...1`
## • `District` -> `District...5`
## • `District` -> `District...9`
District TotalParentResponseRate_mean TotalParentResponseRate_min TotalParentResponseRate_max TotalStudentResponseRate_mean TotalStudentResponseRate_min TotalStudentResponseRate_max TotalTeacherResponseRate_mean TotalTeacherResponseRate_min TotalTeacherResponseRate_max
01 0.3713333 0.04 0.69 0.6025339 0.0552147 0.9821429 0.7800000 0.45 1.00
02 0.2993182 0.00 0.91 0.6565823 0.0062241 0.9646018 0.7617045 0.00 1.00
03 0.3706452 0.04 1.00 0.6445893 0.0100000 1.0000000 0.7383871 0.00 1.00
04 0.3663158 0.00 0.92 0.7724843 0.2476190 0.9813505 0.7705263 0.34 1.00
05 0.2929412 0.06 0.84 0.5669682 0.1597222 1.0000000 0.6382353 0.31 1.00
06 0.6314815 0.04 1.00 0.7377922 0.0143885 1.0000000 0.7570370 0.29 1.00
75 0.3677778 0.15 1.00 0.5494369 0.2500000 0.9757576 0.5177778 0.22 0.78
84 0.3497619 0.00 0.97 0.3642449 0.0000000 0.9971671 0.4785714 0.00 1.00
data_dist <- map_dfc(c("Total Parent Response Rate", "Total Student Response Rate", "Total Teacher Response Rate"), function(x) dstats(data, x)) %>%
  select(-matches("District...[^1]")) %>%
  rename(District = `District...1`)
## New names:
## • `District` -> `District...1`
## • `District` -> `District...5`
## • `District` -> `District...9`

Visualizations

data_dist %>%
  rename_with(~ str_to_title(gsub("(Total)(\\w+)(Response)(Rate)_(\\w+)$", "\\1 \\2 \\3 \\4 \\5", .x, perl = T)), starts_with("Total")) %>%
  select(District, `Total Student Response Rate Mean`) %>%
  ggplot(aes(x = District, y = `Total Student Response Rate Mean`*100, fill = District, label = round(`Total Student Response Rate Mean`*100, 2))) +
  geom_bar(stat = "identity") +
  geom_text(vjust= -.5) +
  scale_y_continuous(limits = c(0, 100)) +
  ggtitle("Mean Student Response Rate by District") +
  ylab("Response Rate") +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

heatmap <- data_dist %>%
  group_by(District) %>%
  rename_with(~ str_to_title(gsub("(Total)(\\w+)(Response)(Rate)_(\\w+)$", "\\1 \\2 \\3 \\4 \\5", .x, perl = T)), starts_with("Total")) %>%
  select(District, ends_with("Mean")) %>%
  pivot_longer(ends_with("Mean"), names_to = "Outcome", values_to = "Value") %>%
  ggplot(aes(x = District, y = Outcome, fill = Value)) +
  geom_tile() +
  ggtitle("Mean Response Rate by District")

ggplotly(heatmap)